07-mle-and-mm

Maximum likelihood (MLE) and method of moments (MM) are two common methods for constructing a model. For this assignment, we are going to model the unknown distributions with maximum likelihood and method of moments.

Data and preparation

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.5     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
require(dplyr)
library(stats4)
Hmisc::getHdata(nhgh)
d1 <- nhgh %>% 
  filter(sex == "female") %>% 
  filter(age >= 18) %>% 
  select(gh, ht) %>% 
  filter(1:n()<=1000)

Glycohemoglobin (MLE)

Normal distribution

In this part, we assume that the Glycohemoglobin data is sampled from the normal distribution, and we plan to calculate the parameters first.

gh <- d1$gh
fun <- function(a, b)
  sum(-log(dnorm(gh, a, b)))
z <- mle(minuslogl = fun, start = list(a = 6, b = 1))
## Warning in dnorm(gh, a, b): NaNs produced

## Warning in dnorm(gh, a, b): NaNs produced

## Warning in dnorm(gh, a, b): NaNs produced
a <- coef(z)[1]
b <- coef(z)[2]
a
##        a 
## 5.724595
b
##        b 
## 1.051692

Then, we are going to overlay the estimated pdf onto histogram.

hist(gh, breaks = 20, freq = FALSE)
curve(dnorm(x, a, b), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pnorm(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- gh
q_candidate <- qnorm

x <- q_candidate((1:200)/200, a, b)
y <- quantile(gh, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qnorm(0.5, a, b)
## [1] 5.724595

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rnorm(1e4, a, b)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##      97.5% 
## 0.05159508

Gamma distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Gamma distribution, and we plan to calculate the parameters first.

gh <- d1$gh
fun <- function(a, b)
  sum(-log(dgamma(gh, a, b)))
z <- mle(minuslogl = fun, start = list(a = 40, b = 7))
## Warning in dgamma(gh, a, b): NaNs produced
a <- coef(z)[1]
b <- coef(z)[2]
a
##        a 
## 40.81883
b
##        b 
## 7.130414

Then, we are going to overlay the estimated pdf onto histogram.

hist(gh, breaks = 20, freq = FALSE)
curve(dgamma(x, a, b), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pgamma(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- gh
q_candidate <- qgamma

x <- q_candidate((1:200)/200, a, b)
y <- quantile(gh, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qgamma(0.5, a, b)
## [1] 5.677929

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rgamma(1e4, a, b)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##      97.5% 
## 0.04374076

Weibull distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Weibull distribution, and we plan to calculate the parameters first.

gh <- d1$gh
fun <- function(a, b)
  sum(-log(dweibull(gh, a, b)))
z <- mle(minuslogl = fun, start = list(a = 4, b = 6))
a <- coef(z)[1]
b <- coef(z)[2]
a
##        a 
## 4.125254
b
##        b 
## 6.173884

Then, we are going to overlay the estimated pdf onto histogram.

hist(gh, breaks = 20, freq = FALSE)
curve(dweibull(x, a, b), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pweibull(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- gh
q_candidate <- qweibull

x <- q_candidate((1:200)/200, a, b)
y <- quantile(gh, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x,y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qweibull(0.5, a, b)
## [1] 5.64902

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rweibull(1e4, a, b)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##     97.5% 
## 0.0777423

Glycohemoglobin (MM)

Normal distribution

In this part, we assume that the Glycohemoglobin data is sampled from the normal distribution, and we plan to calculate the parameters first.

gh <- d1$gh
xb <- mean(gh)
s2 <- var(gh)
mu <- xb
sigma <- sqrt(s2)
mu
## [1] 5.7246
sigma
## [1] 1.052246

Then, we are going to overlay the estimated pdf onto histogram.

hist(gh, breaks = 20, freq = FALSE)
curve(dnorm(x, mu, sigma), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pnorm(x, mu, sigma), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- gh
q_candidate <- qnorm

x <- q_candidate((1:200)/200, mu, sigma)
y <- quantile(gh, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qnorm(0.5, mu, sigma)
## [1] 5.7246

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rnorm(1e4, mu, sigma)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##      97.5% 
## 0.05181461

Gamma distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Gamma distribution, and we plan to calculate the parameters first.

gh <- d1$gh
xb <- mean(gh)
s2 <- var(gh)
lh <- xb/s2
ch <- xb^2/s2
lh
## [1] 5.170237
ch
## [1] 29.59754

Then, we are going to overlay the estimated pdf onto histogram.

hist(gh, breaks = 20, freq = FALSE)
curve(dgamma(x, ch, lh), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pgamma(x, ch, lh), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- gh
q_candidate <- qgamma

x <- q_candidate((1:200)/200, ch, lh)
y <- quantile(gh, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qgamma(0.5, ch, lh)
## [1] 5.660259

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rgamma(1e4, ch, lh)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##      97.5% 
## 0.05126216

Weibull distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Weibull distribution, and we plan to calculate the parameters first.

gh <- d1$gh
xb <- mean(gh)
s2 <- var(gh)
cv2 = s2/xb^2
fun <- function(beta, cv2){
  gamma(1 + 2/beta)/(gamma(1 + 1/beta))^2 - (1 + cv2)
}
k = optimize(fun, interval = c(0, 5), cv2 = cv2, maximum = FALSE)$minimum
lambda = xb/gamma(1 + 1/k)
k
## [1] 4.999937
lambda
## [1] 6.234806

Then, we are going to overlay the estimated pdf onto histogram.

hist(gh, breaks = 20, freq = FALSE)
curve(dweibull(x, k, lambda), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pweibull(x, k, lambda), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- gh
q_candidate <- qweibull

x <- q_candidate((1:200)/200, k, lambda)
y <- quantile(gh, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x,y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qweibull(0.5, k, lambda)
## [1] 5.794122

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rweibull(1e4, k, lambda)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##      97.5% 
## 0.06534943

Height of adult females (MLE)

Normal distribution

In this part, we assume that the Glycohemoglobin data is sampled from the normal distribution, and we plan to calculate the parameters first.

ht <- d1$ht
fun <- function(a, b)
  sum(-log(dnorm(ht, a, b)))
z <- mle(minuslogl = fun, start = list(a = 160, b = 10))
## Warning in dnorm(ht, a, b): NaNs produced
a <- coef(z)[1]
b <- coef(z)[2]
a
##        a 
## 160.7412
b
##        b 
## 7.315558

Then, we are going to overlay the estimated pdf onto histogram.

hist(ht, breaks = 20, freq = FALSE)
curve(dnorm(x, a, b), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pnorm(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- ht
q_candidate <- qnorm

x <- q_candidate((1:200)/200, a, b)
y <- quantile(ht, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qnorm(0.5, a, b)
## [1] 160.7412

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rnorm(1e4, a, b)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##     97.5% 
## 0.3596269

Gamma distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Gamma distribution, and we plan to calculate the parameters first.

ht <- d1$ht
fun <- function(a, b)
  sum(-log(dgamma(ht, a, b)))
z <- mle(minuslogl = fun, start = list(a = 450, b = 2))
a <- coef(z)[1]
b <- coef(z)[2]
a
##        a 
## 480.4147
b
##        b 
## 2.988733

Then, we are going to overlay the estimated pdf onto histogram.

hist(ht, breaks = 20, freq = FALSE)
curve(dgamma(x, a, b), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pgamma(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- ht
q_candidate <- qgamma

x <- q_candidate((1:200)/200, a, b)
y <- quantile(ht, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qgamma(0.5, a, b)
## [1] 160.6304

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rgamma(1e4, a, b)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##     97.5% 
## 0.3585818

Weibull distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Weibull distribution, and we plan to calculate the parameters first.

ht <- d1$ht
fun <- function(a, b)
  sum(-log(dweibull(ht, a, b)))
z <- mle(minuslogl = fun, start = list(a = 20, b = 160))
a <- coef(z)[1]
b <- coef(z)[2]
a
##        a 
## 21.85402
b
##        b 
## 164.2472

Then, we are going to overlay the estimated pdf onto histogram.

hist(ht, breaks = 20, freq = FALSE)
curve(dweibull(x, a, b), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pweibull(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- ht
q_candidate <- qweibull

x <- q_candidate((1:200)/200, a, b)
y <- quantile(ht, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x,y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qweibull(0.5, a, b)
## [1] 161.5156

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rweibull(1e4, a, b)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##     97.5% 
## 0.4179298

Glycohemoglobin (MM)

Normal distribution

In this part, we assume that the Glycohemoglobin data is sampled from the normal distribution, and we plan to calculate the parameters first.

ht <- d1$ht
xb <- mean(ht)
s2 <- var(ht)
mu <- xb
sigma <- sqrt(s2)
mu
## [1] 160.7419
sigma
## [1] 7.320161

Then, we are going to overlay the estimated pdf onto histogram.

hist(ht, breaks = 20, freq = FALSE)
curve(dnorm(x, mu, sigma), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pnorm(x, mu, sigma), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- ht
q_candidate <- qnorm

x <- q_candidate((1:200)/200, mu, sigma)
y <- quantile(ht, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qnorm(0.5, mu, sigma)
## [1] 160.7419

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rnorm(1e4, mu, sigma)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##     97.5% 
## 0.3599606

Gamma distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Gamma distribution, and we plan to calculate the parameters first.

ht <- d1$ht
xb <- mean(ht)
s2 <- var(ht)
lh <- xb/s2
ch <- xb^2/s2
lh
## [1] 2.999769
ch
## [1] 482.1886

Then, we are going to overlay the estimated pdf onto histogram.

hist(ht, breaks = 20, freq = FALSE)
curve(dgamma(x, ch, lh), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pgamma(x, ch, lh), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- ht
q_candidate <- qgamma

x <- q_candidate((1:200)/200, ch, lh)
y <- quantile(ht, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qgamma(0.5, ch, lh)
## [1] 160.6308

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rgamma(1e4, ch, lh)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##     97.5% 
## 0.3605536

Weibull distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Weibull distribution, and we plan to calculate the parameters first.

ht <- d1$ht
xb <- mean(ht)
s2 <- var(ht)
cv2 = s2/xb^2
fun <- function(beta, cv2){
  gamma(1 + 2/beta)/(gamma(1 + 1/beta))^2 - (1 + cv2)
}
k = optimize(fun, interval = c(0, 100), cv2 = cv2, maximum = FALSE)$minimum
lambda = xb/gamma(1 + 1/k)
k
## [1] 99.99992
lambda
## [1] 161.6592

Then, we are going to overlay the estimated pdf onto histogram.

hist(ht, breaks = 20, freq = FALSE)
curve(dweibull(x, k, lambda), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pweibull(x, k, lambda), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- ht
q_candidate <- qweibull

x <- q_candidate((1:200)/200, k, lambda)
y <- quantile(ht, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x,y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qweibull(0.5, k, lambda)
## [1] 161.0678

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rweibull(1e4, k, lambda)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##      97.5% 
## 0.09075406